Spain Occupational Accident Analysis (SODA)

Business Analytics Project

Author

Luis Gentner

Published

May 19, 2023

library(conflicted)
library(tidyverse)
library(scales)
library(ggrepel)
library(gghighlight)
library(mapSpain)
library(ESdata)

conflicts_prefer(
  dplyr::filter(),
  dplyr::select(),
  dplyr::lag(),
)

Introduction

This report covers the analysis of occupational accidents, also known as work accidents, in Spain

Data Exploration

Work Accident Data

load("data/ATR/ATR-I.1.3.RData")
atr |>
  filter(sector == "Total",
         ccaa == "ES") |>
  ggplot(aes(date, accidents)) +
  geom_line() +
  geom_point()

atr |>
  filter(ccaa == "ES") |>
  ggplot(aes(x = date, y = accidents, color = sector)) +
  geom_line() +
  geom_point(size=2) +
    labs(x = "Year",
         y = "Work accidents per 100k workers",
         title = "Work accidents in Spain from 2009 to 2021")

atr |>
  filter(ccaa == "ES") |>
  ggplot(aes(x = date, y = accidents, color = sector)) +
  geom_line() +
  geom_point() +
  gghighlight(sector != "total", label_key = sector, use_group_by = FALSE) +
  facet_wrap(~sector)

atr |>
  filter(sector == "Total",
         year == 2021) |>
  ggplot(aes(x = accidents, y = fct_reorder(ccaa_name, accidents))) +
  geom_col() +
  labs(x = "Work accidents per 100k workers",
       y = NULL,
       title = "Work accidents in 2021 per autonomous community")

Add year over year change

atr <- atr |>
  mutate(acc_yoy = (accidents - lag(accidents)) / lag(accidents),
         .by = c(sector, ccaa))
atr |>
  filter(ccaa == "ES") |>
  drop_na() |>
  ggplot(aes(x = date, y = acc_yoy, color = sector)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = label_percent()) +
  labs(x = "Year",
       y = "Year over year change",
       title = "Work accidents in Spain from 2009 to 2021")

atr |>
  filter(ccaa == "ES") |>
  drop_na() |>
  ggplot(aes(x = date, y = acc_yoy, fill = sector)) +
  geom_col() +
  facet_wrap(~sector) +
  scale_y_continuous(labels = label_percent()) +
  labs(x = "Year",
       y = "Year over year change",
       title = "Work accidents in Spain from 2009 to 2021") +
  theme(legend.position = c(1, 0),
        legend.justification = c(1, 0))

Working population

work_pop <- epa_sector |>
  filter(edad == "total",
         sexo == "total") |>
  select(-edad, -sexo)
work_pop |>
  filter(region == "ES") |>
  ggplot(aes(x = periodo, y = valor, color = sector)) +
  geom_line() +
  geom_point() +
  labs(x = "Year", y = "100k workers",
       title = "Active working population in Spain")

Working population per sector

# Add relative work pop per sector
work_pop <- work_pop |>
  mutate(rel = valor / valor[sector == "total"],
         .by = c(periodo, region))

work_pop |>
  # Add full AC names
  left_join(ccaa_iso, by = join_by(region == iso)) |>
  filter(periodo == "2023-01-31",
         sector != "total",
         !region %in% c("ES-ML", "ES-CE")) |>
  ggplot(aes(rel,  fct_reorder(nombres, rel), fill = sector)) +
  geom_col() +
  scale_x_continuous(labels = label_percent()) +
  labs(x = "Percentage", y = NULL,
       title = "Relative working population per sector")

Non-EU-working population

work_noEU <- epa_nac |>
  # Only view all sexes and working population
  filter(sexo == "total",
         dato == "act") |>
  select(-sexo, -dato) |>
  # Compute relative amount of workforce per nationality
  mutate(rel = valor / valor[nac == "total"],
         .by = c(region, periodo)) |>
  # Only keep no-UE
  filter(nac == "no-UE") |>
  select(-valor, -nac) |>
  rename(noEU_rel = rel)

work_noEU |> head()

Plot non-EU workforce per AC:

work_noEU |>
  filter(periodo == "2023-01-31") |>
  # Add full AC names
  left_join(ccaa_iso, by = join_by(region == iso)) |>
  ggplot(aes(x = noEU_rel, y = fct_reorder(nombres, noEU_rel))) +
  geom_col() +
  scale_x_continuous(labels = label_percent()) +
  labs(x = "Percentage",
       y = NULL,
       title = "Non-EU working population in 2023 per autonomous community")

GDP

agr = c("vab_A", "vab_BE", "vab_F", "vab_GT", "pib")
agr_rename = c(
  "agricultura" = "vab_A",
  "industria" = "vab_BE",
  "construccion" = "vab_F",
  "servicios" = "vab_GT",
  "total" = "pib"
)
gdp <- pib_pm_oferta |>
  filter(agregado %in% agr,
         tipo == "indice",
         dato == "base",
         ajuste == "ajustado") |>
  mutate(month = month(periodo), year = year(periodo)) |>
  filter(month == 6) |>
  # Remove identical elements
  distinct(across(periodo:dato), .keep_all = TRUE) |>
  select(periodo, agregado, valor) |>
  rename(date = periodo, sector = agregado, gdp = valor) |>
  mutate(sector = fct_recode(sector, !!!agr_rename))
gdp |>
  ggplot(aes(date, gdp, color = sector)) +
  geom_line()

Inflation

infl <- ipc_ccaa |>
  # Only extract annual inflation and price index over all groups
  filter(grupo == "general",
         dato %in% c("anual", "indice")) |>
  select(-grupo) |>
  pivot_wider(names_from = dato, values_from = valor) |>
  rename(infl_cpi = indice, infl_ann = anual) |>
  # Annual inflation in decimal format
  mutate(infl_ann = infl_ann / 100)

infl |> head()

Map Plots

# Get Spanish ACs as simple features
ccaa <- esp_get_ccaa()
can_box <- esp_get_can_box()

atr_2021 <- atr |>
  filter(year == 2021,
         ccaa != "ES",
         sector == "Total") |>
  mutate(ccaa = as.character(ccaa))

# Join SF and work accidents data
ccaa_atr <- ccaa |>
  inner_join(atr_2021, by = join_by(iso2.ccaa.code == ccaa))

ggplot(ccaa_atr) +
  # Plot ACs
  geom_sf(aes(fill = accidents),
          color = "grey70",
          linewidth = .3) +
  # Plot canaries box
  geom_sf(data = can_box, color = "grey70") +
  # Plot labels with accident number
  geom_label_repel(
    aes(label = round(accidents), geometry = geometry),
    stat = "sf_coordinates",
    fill = alpha(c("white"), 0.5),
    color = "black",
    size = 3,
    label.size = 0
  ) +
  # Adjust color scale
  scale_fill_distiller(palette = "Blues", direction = 1) +
  theme_void() +
  theme(legend.position = c(0.12, 0.6)) +
  labs(x = NULL,
       y = NULL,
       title = "Work accidents in 2021 per autonomous community",
       fill = "Accidents per\n100k workers")

library(plotly)
library(sf)


ccaa <- esp_get_ccaa() |> st_cast("MULTIPOLYGON")
ccaa_atr <- ccaa |>
  inner_join(atr_2021, by = join_by(iso2.ccaa.code == ccaa))

p <- ggplot(ccaa_atr) +
  geom_sf(aes(fill = accidents,
              text = paste0(ccaa_name, ":\n", round(accidents))),
          color = "grey70",
          linewidth = .3) +
  geom_sf(data = can_box, color = "grey70") +
  scale_fill_distiller(palette = "Blues", direction = 1) +
  theme_void() +
  labs(x = NULL,
       y = NULL,
       title = "Work accidents in 2021 per autonomous community",
       fill = "Accidents per\n100k workers")

ggplotly(p, tooltip = "text") |> style(hoveron = "fill")